library(data.table)
library(Rtsne)
library(R.utils)
library(ggplot2)
library(ggcorrplot)
library(umap)
theme_set(theme_bw())
The original data files were downloaded from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE120575. They were split into two files to make sure the file size is smaller than 100MB, and the original file has a 2nd header line for time points: “Pre_P1”, “Post_P1”, “Post_P1_2”, and that row was deleted in file “GSE120575_Sade_Feldman_melanoma_single_cells_nolabel_1.txt.gz”.
fnm1 = "GSE120575_Sade_Feldman_melanoma_single_cells_nolabel_1.txt.gz"
fnm2 = "GSE120575_Sade_Feldman_melanoma_single_cells_2.txt.gz"
sf_tpm1 = fread(paste0("../../scRNAseq/Sade_Feldman_2018/", fnm1),
fill = TRUE, header = TRUE)
sf_tpm2 = fread(paste0("../../scRNAseq/Sade_Feldman_2018/", fnm2),
fill = TRUE, drop = 16293, col.names = colnames(sf_tpm1))
dim(sf_tpm1)
## [1] 14998 16292
dim(sf_tpm2)
## [1] 40739 16292
sf_tpm1[1:2,1:5]
sf_tpm2[1:2,1:5]
ls_tpm = list(sf_tpm1,sf_tpm2)
sf_tpm = rbindlist(ls_tpm)
rm(ls_tpm)
rm(sf_tpm1)
rm(sf_tpm2)
dim(sf_tpm) # 55737 x 16292
## [1] 55737 16292
sf_tpm[1:2,1:5]
cell.names = colnames(sf_tpm)[-1]
gene.names = sf_tpm$V1
rownames(sf_tpm) = gene.names
sf_tpm$V1 = NULL
Split the cells to reduce memory usage.
grouping = rep(1:82, each=200)[1:length(cell.names)]
reads.per.cell = NULL
for (i in 1:82) {
ss = cell.names[grouping == i]
temp.reads = unlist(sf_tpm[, lapply(.SD, sum), .SDcols = ss])
reads.per.cell = c(reads.per.cell,temp.reads)
}
hist(reads.per.cell, main = "", xlab = "Summation of TPM per cell", breaks=50)
genes.per.cell = NULL
for (i in 1:82) {
ss = cell.names[grouping == i]
temp.counts = unlist(sf_tpm[, lapply(.SD, FUN = function(x) sum(x>0)), .SDcols = ss])
genes.per.cell = c(genes.per.cell,temp.counts)
}
hist(genes.per.cell, main = "", xlab = "Number of expressed genes per cell", breaks=50)
rm(grouping)
gene.grouping = rep(1:28, each = 2000)[1:dim(sf_tpm)[1]]
cells.per.gene = NULL
for (i in 1:28) {
temp.counts = rowSums(sf_tpm[gene.grouping==i,] > 0)
cells.per.gene = c(cells.per.gene,temp.counts)
}
summary(cells.per.gene)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 5 30 704 374 16291
table(cells.per.gene == 0)
##
## FALSE TRUE
## 50513 5224
table(cells.per.gene == 1)
##
## FALSE TRUE
## 53323 2414
table(cells.per.gene <= 5)
##
## FALSE TRUE
## 40941 14796
hist(log10(cells.per.gene), main = "", breaks=50,
xlab = "log10(number of cells) each gene is expressed")
rm(gene.grouping)
plot the variance of the expression levels for each gene against its mean expression level
# kept 12705 genes
large.express.genes.all = gene.names[which(cells.per.gene > 500)]
large.expressed.tpm = sf_tpm[match(large.express.genes.all,rownames(sf_tpm)),]
df_mv = data.frame(gene=large.express.genes.all,
mean=rowMeans(large.expressed.tpm),
var=apply(large.expressed.tpm, 1, var))
ggplot(data = df_mv) +
geom_point(aes(x=mean,y=var),size = 0.1)
rm(large.expressed.tpm)
Next we divide the genes into 20 equal-size bins based on their mean expression levels. In each bin, we select the first 50 genes with the highest variance
df_mv$grouping = cut_number(df_mv$mean, n=20)
table(df_mv$grouping)
##
## [0.0206,0.149] (0.149,0.195] (0.195,0.235] (0.235,0.277] (0.277,0.323]
## 636 635 635 635 636
## (0.323,0.378] (0.378,0.436] (0.436,0.504] (0.504,0.576] (0.576,0.666]
## 635 635 635 635 636
## (0.666,0.769] (0.769,0.886] (0.886,1.02] (1.02,1.18] (1.18,1.38]
## 635 635 635 635 636
## (1.38,1.64] (1.64,1.98] (1.98,2.57] (2.57,3.72] (3.72,16.4]
## 635 635 635 635 636
hv.genes.list = by(df_mv, df_mv$grouping,
FUN = function(df) return(df$gene[order(df$var,decreasing = TRUE)][1:50]))
hv.genes.all = as.character(unlist(hv.genes.list))
rm(hv.genes.list)
hvg_tpm = sf_tpm[match(hv.genes.all,rownames(sf_tpm)),]
hvg_tpm = t(hvg_tpm)
colnames(hvg_tpm) = hv.genes.all
dim(hvg_tpm)
## [1] 16291 1000
hvg_tpm[1:2,1:5]
## ZNF829 PHLDB1 RP11-4F22.2 GBAP1 ULK2
## A10_P3_M11 0 0 0 0 0
## A11_P1_M11 0 0 0 0 0
rm(df_mv)
df_mv2 = data.frame(gene=hv.genes.all, mean=colMeans(hvg_tpm),
var=apply(hvg_tpm, 2, var))
ggplot(data = df_mv2) +
geom_point(aes(x=mean,y=var),size = 0.1)
hvg_tpm = log(hvg_tpm + 1)
pca.hvg = princomp(hvg_tpm, cor = TRUE)
eigvals = pca.hvg$sdev^2
pve = eigvals/sum(eigvals)
plot(1:50, pve[1:50], main = "scree plot (first 50 PCs)", type="b",
xlab="i-th PC", ylab="Proportion of Variance Explained")
yvec = cumsum(pve)
plot(yvec, type="l", main = "plot of cumulative proportion of variance",
xlab = "i-th Principal Components",
ylab = "Cumulative variance", ylim=c(0,1))
abline(h=0.8, lty = 2)
plot(yvec[1:100], type="l", main = "plot of cumulative proportion of variance",
xlab = "i-th Principal Components",
ylab = "Cumulative variance")
pca.scores = pca.hvg$scores
rm(pca.hvg)
top50pcs = pca.scores[,1:50]
rm(pca.scores)
set.seed(9999)
tsne = Rtsne(top50pcs, pca=FALSE)
df_tsne = as.data.frame(tsne$Y)
names(df_tsne) = paste0("topPC_TSNE", seq(ncol(tsne$Y)))
rm(tsne)
pcs_umap = umap(top50pcs)
df_umap = as.data.frame(pcs_umap$layout)
names(df_umap) = paste0("topPC_UMAP",seq(ncol(df_umap)))
rm(pcs_umap)
Prepare dataframe for k-means clustering using top 50 PCs
df_top50PCs = cbind(top50pcs, df_tsne)
df_top50PCs = cbind(df_top50PCs, df_umap)
rm(df_tsne)
rm(df_umap)
dim(df_top50PCs)
## [1] 16291 54
df_top50PCs[1:2, c(1:2,48:52)]
Try Kmeans for 6 to 12 clusters
set.seed(9999)
all_num_clust = c(6:12)
for (num_clust in all_num_clust) {
cat(paste0("KM with ",num_clust," clusters.\n"))
kmeans_out = kmeans(top50pcs, centers = num_clust, iter.max = 1e3,
nstart = 50, algorithm = "MacQueen")
print(kmeans_out[c("betweenss","tot.withinss")])
km_label = paste0("KM_",num_clust)
df_top50PCs[[km_label]] = as.factor(kmeans_out$cluster)
pt = ggplot(data=df_top50PCs) +
geom_point(aes(x=topPC_UMAP1, y=topPC_UMAP2,
color=eval(as.name(paste(km_label)))), size = 0.1) +
scale_colour_discrete(name=paste(km_label)) +
guides(color = guide_legend(override.aes = list(size = 2)))
print(pt)
pt = ggplot(data=df_top50PCs) +
geom_point(aes(x=topPC_TSNE1, y=topPC_TSNE2,
color=eval(as.name(paste(km_label)))), size = 0.1) +
scale_colour_discrete(name=paste(km_label)) +
guides(color = guide_legend(override.aes = list(size = 2)))
print(pt)
rm(kmeans_out)
rm(pt)
}
## KM with 6 clusters.
## $betweenss
## [1] 1784958
##
## $tot.withinss
## [1] 2031680
## KM with 7 clusters.
## $betweenss
## [1] 1883331
##
## $tot.withinss
## [1] 1933306
## KM with 8 clusters.
## $betweenss
## [1] 1968328
##
## $tot.withinss
## [1] 1848310
## KM with 9 clusters.
## $betweenss
## [1] 2038536
##
## $tot.withinss
## [1] 1778102
## KM with 10 clusters.
## $betweenss
## [1] 2082016
##
## $tot.withinss
## [1] 1734621
## KM with 11 clusters.
## $betweenss
## [1] 2117752
##
## $tot.withinss
## [1] 1698885
## KM with 12 clusters.
## $betweenss
## [1] 2150178
##
## $tot.withinss
## [1] 1666459
Compared to the clustering result from Sade-Feldman et al., 2018
xnm = "1-s2.0-S0092867418313941-mmc1.xlsx"
table_s1 = readxl::read_xlsx(paste0("../../_reference/Sade_Feldman_2018/", xnm),
sheet = "Cluster annotation-Fig1B-C", col_names = TRUE)
dim(df_top50PCs)
## [1] 16291 61
dim(table_s1)
## [1] 16291 2
table_s1[1:2,]
table(table_s1$`Cluster number`)
##
## 1 2 3 4 5 6 7 8 9 10 11
## 1455 305 1391 290 2165 2222 1740 2165 1656 1773 1129
clabel = c("B_cells", "Plasma_cells", "Monocytes_Macrophages", "Dendritic_cells",
"Lymphocytes", "Exhausted_CD8T", "Tregs", "Cytotoxicity",
"Exhausted_HS_CD8T", "memory_T", "Lymphocytes_cell_cycle")
table_s1$cluster_label = clabel[table_s1$`Cluster number`]
dim(table_s1)
## [1] 16291 3
table_s1[1:2,]
table(rownames(df_top50PCs) == table_s1$`Cell Name`)
##
## FALSE TRUE
## 991 15300
setequal(rownames(df_top50PCs), table_s1$`Cell Name`)
## [1] FALSE
cell_nm_check = cbind(rownames(df_top50PCs), table_s1$`Cell Name`)
w2check = which(cell_nm_check[,1] != cell_nm_check[,2])
cell_nm_check = cell_nm_check[w2check,]
dim(cell_nm_check)
## [1] 991 2
cell_nm_check[c(1:2,501:502, 990:991),]
## [,1] [,2]
## [1,] "A3_P7_MMD9_L001_myeloid_enriched" "A3_P7_MMD9_L001"
## [2,] "A5_P7_MMD9_L001_myeloid_enriched" "A5_P7_MMD9_L001"
## [3,] "E3_P12_MMD2-36B_T_enriched" "E3_P12_MMD2-36B_DN"
## [4,] "E4_P12_MMD2-36B_T_enriched" "E4_P12_MMD2-36B_DN"
## [5,] "H8_P5_M67_L001_T_enriched" "H8_P5_M67_L001"
## [6,] "H9_P5_M67_L001_T_enriched" "H9_P5_M67_L001"
cell_nm_check[,1] = gsub("_myeloid_enriched", "", cell_nm_check[,1], fixed=TRUE)
cell_nm_check[,1] = gsub("_T_enriched", "", cell_nm_check[,1], fixed=TRUE)
cell_nm_check[,2] = gsub("_DN$", "", cell_nm_check[,2], perl=TRUE)
cell_nm_check[,2] = gsub("_DN1$", "", cell_nm_check[,2], perl=TRUE)
cell_nm_check[,2] = gsub("_DP1$", "", cell_nm_check[,2], perl=TRUE)
table(cell_nm_check[,1] == cell_nm_check[,2])
##
## TRUE
## 991
df_top50PCs$sf_cluster = as.factor(table_s1$`Cluster number`)
df_top50PCs$sf_label = as.factor(table_s1$cluster_label)
Cluster annotation from Sade-Feldman et al., 2018, 11 clusters
ggplot(data=df_top50PCs) +
geom_point(aes(x=topPC_UMAP1, y=topPC_UMAP2, color=sf_label),
size = 0.1) +
guides(color = guide_legend(override.aes = list(size = 2)))
Cluster annotation from my Kmeans analysis, 12 clusters
ggplot(data=df_top50PCs) +
geom_point(aes(x=topPC_UMAP1, y=topPC_UMAP2, color=KM_12), size = 0.1) +
guides(color = guide_legend(override.aes = list(size = 2)))
tb1 = table(df_top50PCs$KM_12, df_top50PCs$sf_label)
tb1
##
## B_cells Cytotoxicity Dendritic_cells Exhausted_CD8T Exhausted_HS_CD8T
## 1 1 667 0 13 11
## 2 1 1 0 0 0
## 3 2 6 0 28 8
## 4 0 0 2 1 0
## 5 5 58 1 44 33
## 6 1 1048 0 549 897
## 7 1 1 0 0 1
## 8 0 0 282 0 0
## 9 15 167 5 43 20
## 10 2 215 0 1542 686
## 11 0 0 0 0 0
## 12 1427 2 0 2 0
##
## Lymphocytes Lymphocytes_cell_cycle memory_T Monocytes_Macrophages
## 1 244 7 16 0
## 2 1 0 0 0
## 3 4 806 5 7
## 4 1 0 0 603
## 5 32 92 138 5
## 6 1163 20 130 1
## 7 1 0 0 399
## 8 0 0 0 16
## 9 683 5 1442 31
## 10 35 190 36 0
## 11 0 2 0 329
## 12 1 7 6 0
##
## Plasma_cells Tregs
## 1 0 4
## 2 300 0
## 3 3 17
## 4 0 1
## 5 0 1271
## 6 0 33
## 7 0 1
## 8 0 0
## 9 1 395
## 10 0 14
## 11 0 0
## 12 1 4
rowSums(tb1 > 100)
## 1 2 3 4 5 6 7 8 9 10 11 12
## 2 1 1 1 2 5 1 1 4 4 1 1
colSums(tb1 > 100)
## B_cells Cytotoxicity Dendritic_cells
## 1 4 1
## Exhausted_CD8T Exhausted_HS_CD8T Lymphocytes
## 2 2 3
## Lymphocytes_cell_cycle memory_T Monocytes_Macrophages
## 2 3 3
## Plasma_cells Tregs
## 1 2
cell_type = data.frame(cell = cell.names, type1 = NA)
for(ct1 in c("B_cells", "Dendritic_cells", "Monocytes_Macrophages", "Plasma_cells")){
kmc = which(tb1[,ct1] > 100)
cell_type$type1[which(df_top50PCs$KM_12 %in% kmc & df_top50PCs$sf_label==ct1)] = ct1
}
for(ct1 in c("Tregs", "memory_T")){
kmc = which.max(tb1[,ct1])
cell_type$type1[which(df_top50PCs$KM_12 %in% kmc & df_top50PCs$sf_label==ct1)] = ct1
}
table(cell_type$type1, df_top50PCs$KM_12)
##
## 1 2 3 4 5 6 7 8 9 10 11
## B_cells 0 0 0 0 0 0 0 0 0 0 0
## Dendritic_cells 0 0 0 0 0 0 0 282 0 0 0
## memory_T 0 0 0 0 0 0 0 0 1442 0 0
## Monocytes_Macrophages 0 0 0 603 0 0 399 0 0 0 329
## Plasma_cells 0 300 0 0 0 0 0 0 0 0 0
## Tregs 0 0 0 0 1271 0 0 0 0 0 0
##
## 12
## B_cells 1427
## Dendritic_cells 0
## memory_T 0
## Monocytes_Macrophages 0
## Plasma_cells 0
## Tregs 0
rm(top50pcs)
rm(df_top50PCs)
# Cell cluster annotation of CD8+ T cells
table_s2 = read.csv("output/CD8_cluster.csv", stringsAsFactors = FALSE)
dim(table_s2)
## [1] 6350 3
table_s2[1:2,]
table(table_s2$Cluster, table_s2$ruoyi_cluster, useNA="ifany")
##
## CD8_B CD8_G <NA>
## CD8_B 2351 0 672
## CD8_G 0 3046 281
CD8_B = table_s2$Cell.Name[which(table_s2$ruoyi_cluster=="CD8_B")]
CD8_G = table_s2$Cell.Name[which(table_s2$ruoyi_cluster=="CD8_G")]
table(CD8_B %in% cell_type$cell)
##
## TRUE
## 2351
table(CD8_G %in% cell_type$cell)
##
## TRUE
## 3046
cell_type$type2 = rep("others", nrow(cell_type))
cell_type$type2[match(CD8_B,cell_type$cell)] = "CD8T_B"
cell_type$type2[match(CD8_G,cell_type$cell)] = "CD8T_G"
table(cell_type$type1, cell_type$type2, useNA="ifany")
##
## CD8T_B CD8T_G others
## B_cells 0 0 1427
## Dendritic_cells 1 0 281
## memory_T 2 341 1099
## Monocytes_Macrophages 1 0 1330
## Plasma_cells 11 5 284
## Tregs 14 5 1252
## <NA> 2322 2695 5221
First, I get all the CD8 and NK cells, corresponding to the cells in clusters 5 (Lymphocytes), 6 (Exhausted_CD8T), 8 (Cytotoxicity), 9 (Exhausted_HS_CD8T) and 11 (Lymphocytes_cell_cycle) in Sade_Feldman et al. (2018). There are 9337 such cells. Then, I excluded the CD8 T cells using the annotation in the paper. 3513 cells remained after excluding the CD8 T cells.
table(paste(table_s1$`Cluster number`, table_s1$cluster_label))
##
## 1 B_cells 10 memory_T 11 Lymphocytes_cell_cycle
## 1455 1773 1129
## 2 Plasma_cells 3 Monocytes_Macrophages 4 Dendritic_cells
## 305 1391 290
## 5 Lymphocytes 6 Exhausted_CD8T 7 Tregs
## 2165 2222 1740
## 8 Cytotoxicity 9 Exhausted_HS_CD8T
## 2165 1656
CD8.NK.cells = cell.names[as.numeric(table_s1$`Cluster number`) %in% c(5,6,8,9,11)]
TNK.cells = CD8.NK.cells[! CD8.NK.cells %in% table_s2$Cell.Name]
TNKcells_tpm = sf_tpm[, ..TNK.cells]
row.names(TNKcells_tpm) = gene.names
dim(TNKcells_tpm)
## [1] 55737 3513
TNKcells_tpm[1:2,1:5]
gene.grouping = rep(1:28, each = 2000)[1:dim(TNKcells_tpm)[1]]
cells.per.gene = NULL
for (i in 1:28) {
temp.counts = apply(TNKcells_tpm[gene.grouping==i,], 1, FUN = function(x) sum(x>0))
cells.per.gene = c(cells.per.gene,temp.counts)
}
table(cells.per.gene == 0)
##
## FALSE TRUE
## 43842 11895
table(cells.per.gene == 1)
##
## FALSE TRUE
## 49939 5798
hist(log10(cells.per.gene), main = "", breaks = 50,
xlab = "log10(number of cells) each gene is expressed")
rm(gene.grouping)
plot the variance of the expression levels for each gene against its mean expression level
# kept 10925 genes
genes2kp = which(cells.per.gene > 150)
large.express.genes.TNK = rownames(TNKcells_tpm)[genes2kp]
large.expressed.tpm = TNKcells_tpm[genes2kp,]
dim(large.expressed.tpm)
## [1] 10925 3513
df_mv = data.frame(gene=large.express.genes.TNK,
mean=rowMeans(large.expressed.tpm),
var=apply(large.expressed.tpm, 1, var))
rm(large.expressed.tpm)
ggplot(data = df_mv) +
geom_point(aes(x=mean,y=var),size = 0.1)
Next, divide the genes into 20 equal-size bins based on their mean expression levels, and select the first 50 genes with the highest variance from each bin.
df_mv$grouping = cut_number(df_mv$mean, n=20)
table(df_mv$grouping)
##
## [0.0226,0.215] (0.215,0.281] (0.281,0.33] (0.33,0.378] (0.378,0.432]
## 547 546 546 546 547
## (0.432,0.494] (0.494,0.562] (0.562,0.64] (0.64,0.727] (0.727,0.822]
## 546 546 549 543 547
## (0.822,0.932] (0.932,1.05] (1.05,1.2] (1.2,1.38] (1.38,1.6]
## 546 546 546 546 547
## (1.6,1.87] (1.87,2.26] (2.26,2.92] (2.92,4.2] (4.2,16.4]
## 546 546 546 546 547
hv.genes.list = by(df_mv, df_mv$grouping,
FUN = function(df) return(df$gene[order(df$var,decreasing = TRUE)][1:50]))
hv.genes.TNK = as.character(unlist(hv.genes.list))
rm(hv.genes.list)
hvg_TNKcells_tpm = TNKcells_tpm[match(hv.genes.TNK, rownames(TNKcells_tpm)),]
hvg_TNKcells_tpm = t(hvg_TNKcells_tpm)
colnames(hvg_TNKcells_tpm) = hv.genes.TNK
rm(df_mv)
dim(hvg_TNKcells_tpm)
## [1] 3513 1000
hvg_TNKcells_tpm[1:2,1:4]
## ZNF816 AP000476.1 ZNF549 MLTK
## A10_P3_M11 0 0 0 0
## A11_P1_M11 0 0 0 0
df_mv2 = data.frame(gene=hv.genes.TNK,
mean=colMeans(hvg_TNKcells_tpm),
var=apply(hvg_TNKcells_tpm, 2, var))
ggplot(data = df_mv2) +
geom_point(aes(x=mean,y=var),size = 0.1)
pca.hvg = princomp(log(hvg_TNKcells_tpm+1), cor = TRUE)
rm(hvg_TNKcells_tpm)
pca.scores = pca.hvg$scores
rm(pca.hvg)
top50pcs_TNKcells = pca.scores[,1:50]
rm(pca.scores)
pcs_umap = umap(top50pcs_TNKcells)
df_umap = as.data.frame(pcs_umap$layout)
names(df_umap) = paste0("topPC_UMAP",seq(ncol(df_umap)))
rm(pcs_umap)
Prepare dataframe for k-means clustering using top 50 PCs
df_top50PCs_TNKcells = cbind(top50pcs_TNKcells, df_umap)
rm(df_umap)
dim(df_top50PCs_TNKcells)
## [1] 3513 52
df_top50PCs_TNKcells[1:2,c(1:2,50:52)]
set.seed(9999)
all_num_clust = c(2:6)
for (num_clust in all_num_clust) {
cat(paste0("KM with ",num_clust," clusters.\n"))
kmeans_out = kmeans(top50pcs_TNKcells, centers = num_clust,
iter.max = 1e3,
nstart = 50, algorithm = "MacQueen")
print(kmeans_out[c("betweenss","tot.withinss")])
km_label = paste0("KM_",num_clust)
df_top50PCs_TNKcells[[km_label]] = as.factor(kmeans_out$cluster)
pt = ggplot(data=df_top50PCs_TNKcells) +
geom_point(aes(x=topPC_UMAP1,y=topPC_UMAP2,
color=eval(as.name(paste(km_label)))), size = 0.1) +
scale_colour_discrete(name=paste(km_label)) +
guides(color = guide_legend(override.aes = list(size = 2)))
print(pt)
rm(kmeans_out)
rm(pt)
}
## KM with 2 clusters.
## $betweenss
## [1] 168403.8
##
## $tot.withinss
## [1] 573296.8
## KM with 3 clusters.
## $betweenss
## [1] 206432.2
##
## $tot.withinss
## [1] 535268.5
## KM with 4 clusters.
## $betweenss
## [1] 231524.6
##
## $tot.withinss
## [1] 510176
## KM with 5 clusters.
## $betweenss
## [1] 246807.8
##
## $tot.withinss
## [1] 494892.9
## KM with 6 clusters.
## $betweenss
## [1] 259523.3
##
## $tot.withinss
## [1] 482177.3
df_top50PCs_TNKcells$sf_cluster =
as.factor(table_s1$cluster_label[match(TNK.cells, cell.names)])
ggplot(data=df_top50PCs_TNKcells) +
geom_point(aes(x=topPC_UMAP1,y=topPC_UMAP2,color=sf_cluster),size = 0.1) +
guides(color = guide_legend(override.aes = list(size = 2)))
The clustering results from k-means analysis using 3-6 clusters are generally very similar: G5-Lymphocytes and G8-Cytotoxicity form one cluster, G6-Exhausted_CD8T and G9-Exhausted_HS_CD8T form one cluster, and G11-Lymphocytes_cell_cycle forms an individual cluster. Using 3 clusters would be suffice.
NK1 = row.names(df_top50PCs_TNKcells)[df_top50PCs_TNKcells$KM_3==1]
NK2 = row.names(df_top50PCs_TNKcells)[df_top50PCs_TNKcells$KM_3==2]
NK3 = row.names(df_top50PCs_TNKcells)[df_top50PCs_TNKcells$KM_3==3]
cell_type$type2[match(NK1,cell_type$cell)] = "NK1"
cell_type$type2[match(NK2,cell_type$cell)] = "NK2"
cell_type$type2[match(NK3,cell_type$cell)] = "NK3"
dim(cell_type)
## [1] 16291 3
cell_type[1:2,]
table(cell_type$type1, useNA = "ifany")
##
## B_cells Dendritic_cells memory_T
## 1427 282 1442
## Monocytes_Macrophages Plasma_cells Tregs
## 1331 300 1271
## <NA>
## 10238
table(cell_type$type2, useNA = "ifany")
##
## CD8T_B CD8T_G NK1 NK2 NK3 others
## 2351 3046 456 1961 1096 7381
table(cell_type$type1, cell_type$type2, useNA = "ifany")
##
## CD8T_B CD8T_G NK1 NK2 NK3 others
## B_cells 0 0 0 0 0 1427
## Dendritic_cells 1 0 0 0 0 281
## memory_T 2 341 0 0 0 1099
## Monocytes_Macrophages 1 0 0 0 0 1330
## Plasma_cells 11 5 0 0 0 284
## Tregs 14 5 0 0 0 1252
## <NA> 2322 2695 456 1961 1096 1708
Merge the cell type definition from the Kmeans clustering of all the cells (cell_type$type1) and the Kmean clustering of the CD8+ T cells and NK cells (cell_type$type2).
cell_type$type1[which(is.na(cell_type$type1))] = "unclustered"
# remove the cluster membership for any cells classified as CD8T or NK
cell_type$type = cell_type$type1
cell_type$type[which(cell_type$type2 != "others")] = "unclustered"
# now only CD4+ T cells are left for T memory cells
cell_type$type[which(cell_type$type == "memory_T")] = "CD4T_memory"
# add labels of CD8+ T and NK cells
ct_TNK = (cell_type$type1 == "unclustered")
cell_type$type[which(ct_TNK & cell_type$type2 == "CD8T_B")] = "CD8T_B"
ct_TmemNK = (cell_type$type1 %in% c("memory_T", "unclustered"))
cell_type$type[which(ct_TmemNK & cell_type$type2 == "CD8T_G")] = "CD8T_G"
cell_type$type[which(cell_type$type2 == "NK1")] = "NK1"
cell_type$type[which(cell_type$type2 == "NK2")] = "NK2"
cell_type$type[which(cell_type$type2 == "NK3")] = "NK3"
table(cell_type$type, cell_type$type1, useNA = "ifany")
##
## B_cells Dendritic_cells memory_T Monocytes_Macrophages
## B_cells 1427 0 0 0
## CD4T_memory 0 0 1099 0
## CD8T_B 0 0 0 0
## CD8T_G 0 0 341 0
## Dendritic_cells 0 281 0 0
## Monocytes_Macrophages 0 0 0 1330
## NK1 0 0 0 0
## NK2 0 0 0 0
## NK3 0 0 0 0
## Plasma_cells 0 0 0 0
## Tregs 0 0 0 0
## unclustered 0 1 2 1
##
## Plasma_cells Tregs unclustered
## B_cells 0 0 0
## CD4T_memory 0 0 0
## CD8T_B 0 0 2322
## CD8T_G 0 0 2695
## Dendritic_cells 0 0 0
## Monocytes_Macrophages 0 0 0
## NK1 0 0 456
## NK2 0 0 1961
## NK3 0 0 1096
## Plasma_cells 284 0 0
## Tregs 0 1252 0
## unclustered 16 19 1708
table(cell_type$type, cell_type$type2, useNA = "ifany")
##
## CD8T_B CD8T_G NK1 NK2 NK3 others
## B_cells 0 0 0 0 0 1427
## CD4T_memory 0 0 0 0 0 1099
## CD8T_B 2322 0 0 0 0 0
## CD8T_G 0 3036 0 0 0 0
## Dendritic_cells 0 0 0 0 0 281
## Monocytes_Macrophages 0 0 0 0 0 1330
## NK1 0 0 456 0 0 0
## NK2 0 0 0 1961 0 0
## NK3 0 0 0 0 1096 0
## Plasma_cells 0 0 0 0 0 284
## Tregs 0 0 0 0 0 1252
## unclustered 29 10 0 0 0 1708
cluster_ct = cell_type$type[match(names(sf_tpm), cell_type$cell)]
plot1 <- function(gene1, edata, cluster_ct){
y = as.numeric(edata[which(rownames(edata) == gene1),])
df1 = data.frame(expression = y, cell_type = cluster_ct)
g1 = ggplot(df1, aes(x=cell_type, y=expression, fill=cell_type)) +
geom_boxplot() + coord_flip() + theme(legend.position = "none") +
ggtitle(gene1)
print(g1)
}
plot2 <- function(gene1, gene2, edata, cluster_ct, cluster2use){
y1 = as.numeric(edata[which(rownames(edata) == gene1),])
y2 = as.numeric(edata[which(rownames(edata) == gene2),])
df1 = data.frame(y1=y1, y2=y2, cell_type = cluster_ct)
df1 = df1[which(df1$cell_type %in% cluster2use),]
g1 = ggplot(df1, aes(x=y1, y=y2, color=cell_type)) +
geom_point(size=0.4) + xlab(gene1) + ylab(gene2)
print(g1)
}
plot1("FOXP3", sf_tpm, cluster_ct)
plot1("GZMA", sf_tpm, cluster_ct)
plot1("PDCD1", sf_tpm, cluster_ct)
plot1("CTLA4", sf_tpm, cluster_ct)
plot1("LAG3", sf_tpm, cluster_ct)
plot1("CD3E", sf_tpm, cluster_ct)
plot1("CD4", sf_tpm, cluster_ct)
plot1("CD8A", sf_tpm, cluster_ct)
plot1("CD8B", sf_tpm, cluster_ct)
plot1("CD33", sf_tpm, cluster_ct)
plot1("CD14", sf_tpm, cluster_ct)
plot1("CD19", sf_tpm, cluster_ct)
plot1("CD8A", sf_tpm, cluster_ct)
plot1("CD8B", sf_tpm, cluster_ct)
cluster2use = c("CD8T_B", "CD8T_G", "NK1", "NK2", "NK3")
plot2("CD8A", "CD8B", sf_tpm, cluster_ct, cluster2use)
# CD56
plot1("NCAM1", sf_tpm, cluster_ct)
# CD16
plot1("FCGR3A", sf_tpm, cluster_ct)
# NKP46
plot1("NCR1", sf_tpm, cluster_ct)
Next check 13 marker genes from > Crinier, Adeline, et al. “High-dimensional single-cell analysis identifies organ-specific signatures and conserved NK cell subsets in humans and mice.” Immunity 49.5 (2018): 971-986.
plot1("CD160", sf_tpm, cluster_ct)
plot1("CD244", sf_tpm, cluster_ct)
plot1("CHST12", sf_tpm, cluster_ct)
plot1("CST7", sf_tpm, cluster_ct)
plot1("GNLY", sf_tpm, cluster_ct)
plot1("IL18RAP", sf_tpm, cluster_ct)
plot1("IL2RB", sf_tpm, cluster_ct)
# NKG2A
plot1("KLRC1", sf_tpm, cluster_ct)
# NKG2E
plot1("KLRC3", sf_tpm, cluster_ct)
# CD94
plot1("KLRD1", sf_tpm, cluster_ct)
# NKp80
plot1("KLRF1", sf_tpm, cluster_ct)
plot1("PRF1", sf_tpm, cluster_ct)
plot1("XCL2", sf_tpm, cluster_ct)
plot2("GNLY", "CD8A", sf_tpm, cluster_ct, cluster2use)
Next additional marker genes from > Freud, Aharon G., et al. “The broad spectrum of human natural killer cell diversity.” Immunity 47.5 (2017): 820-833.
# NKG2C
plot1("KLRC2", sf_tpm, cluster_ct)
# NKG2D
plot1("KLRK1", sf_tpm, cluster_ct)
plot2("KLRK1", "CD8A", sf_tpm, cluster_ct, cluster2use)
plot1("EOMES", sf_tpm, cluster_ct)
plot2("EOMES", "CD8A", sf_tpm, cluster_ct, cluster2use)
# T-BET
plot1("TBX21", sf_tpm, cluster_ct)
# KIR genes
plot1("KIR3DL1", sf_tpm, cluster_ct)
plot1("KIR3DL2", sf_tpm, cluster_ct)
plot1("KIR3DL3", sf_tpm, cluster_ct)
plot1("KIR3DP1", sf_tpm, cluster_ct)
table(cell_type$type1[cell_type$type2=="others"])
##
## B_cells Dendritic_cells memory_T
## 1427 281 1099
## Monocytes_Macrophages Plasma_cells Tregs
## 1330 284 1252
## unclustered
## 1708
write.table(cell_type, "output/cell_type.txt", quote = FALSE,
col.names = TRUE, row.names = FALSE,
na = "unclustered", sep = "\t")
G1 B cells, G2 Plasma cells, G3 Monocytes/Macrophages, G4 Dendritic cells, G7 Regulatory T cells and G10 Memory T-cells could be clearly identified using k-means clustering based on the PCA of high variance genes. The tricky part is to distinguish NK cells and CD8 cells.
Recluster G5 - G11 cells didn’t really improved the result, and G7 and G10 could also be roughly identified when clustering on all cells, so we changed to recluster cells in G5, G6, G8, G9 and G11.
Recluster G5, G6, G8, G9, G11 still didn’t give a clear clustering for NK cells, so we moved to excluding all CD8 T cells based on the annotation of the paper, and then cluster the remaining cells.
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2160295 115.4 5734478 306.3 NA 5734478 306.3
## Vcells 1216409633 9280.5 2129235534 16244.8 32768 2129214556 16244.7
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] umap_0.2.7.0 ggcorrplot_0.1.3 ggplot2_3.3.1 R.utils_2.9.2
## [5] R.oo_1.23.0 R.methodsS3_1.8.0 Rtsne_0.15 data.table_1.12.8
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.3 RSpectra_0.16-0 cellranger_1.1.0 compiler_3.6.2
## [5] pillar_1.4.3 tools_3.6.2 digest_0.6.23 jsonlite_1.6.1
## [9] lattice_0.20-38 evaluate_0.14 lifecycle_0.2.0 tibble_3.0.1
## [13] gtable_0.3.0 pkgconfig_2.0.3 rlang_0.4.6 Matrix_1.2-18
## [17] yaml_2.2.1 xfun_0.12 withr_2.1.2 stringr_1.4.0
## [21] dplyr_0.8.4 knitr_1.28 askpass_1.1 vctrs_0.3.0
## [25] grid_3.6.2 tidyselect_1.0.0 reticulate_1.15 glue_1.3.1
## [29] R6_2.4.1 readxl_1.3.1 rmarkdown_2.1 farver_2.0.3
## [33] purrr_0.3.3 magrittr_1.5 scales_1.1.0 htmltools_0.4.0
## [37] ellipsis_0.3.0 assertthat_0.2.1 colorspace_1.4-1 labeling_0.3
## [41] stringi_1.4.5 openssl_1.4.1 munsell_0.5.0 crayon_1.3.4